En este notebook se realiza un análisis exploratorio inicial (EDA) de múltiples conjuntos de datos candidatos a complementar el conjunto de datos final utilizado en el proyecto. Los conjuntos de datos a analizar son los siguientes:
from pathlib import Path
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import imagesize
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import json
from waste_detection_system import shared_data as base, utils, dataset_creator
# plot style
# ==============================================================================
# Set title font sizes for every figure produced in this notebook.
plt.rcParams.update({
    'axes.titlesize': 12,
    'figure.titlesize': 16,
})
Se trata de un conjunto de datos en un contexto menos ideal (imágenes capturadas con cámaras variadas en entornos mayormente exteriores), con una o varias anotaciones por imagen. Las categorías reflejadas en este conjunto de datos son:
Las anotaciones se encuentran en formato COCO.
# Build one DataFrame row per (image, annotation) pair from the TACO
# COCO-format annotations file.
images = {
    'name': [],
    'path': [],
    'width': [],
    'height': [],
    'label': [],
    'bbox-x': [],
    'bbox-y': [],
    'bbox-w': [],
    'bbox-h': [],
}
with open(base.TACO / 'annotations.json', 'r') as _file:
    json_file = json.load(_file)

# Index annotations by image id and categories by id ONCE, instead of
# rescanning the full annotation/category lists for every image
# (the original loop was O(images x annotations)).
anns_by_image = {}
for ann in json_file['annotations']:
    anns_by_image.setdefault(ann['image_id'], []).append(ann)
cat_by_id = {cat['id']: cat for cat in json_file['categories']}

for image in json_file['images']:
    for ann in anns_by_image.get(image['id'], []):
        cat = cat_by_id[ann['category_id']]
        bbox = ann['bbox']  # COCO bbox convention: [x, y, width, height]
        # .append instead of `lst = lst + [x]`, which copies the whole
        # list on every iteration (quadratic).
        images['name'].append(image['file_name'])
        images['path'].append(str(base.TACO / image['file_name']))
        images['width'].append(image['width'])
        images['height'].append(image['height'])
        images['label'].append(cat['name'])
        images['bbox-x'].append(bbox[0])
        images['bbox-y'].append(bbox[1])
        images['bbox-w'].append(bbox[2])
        images['bbox-h'].append(bbox[3])

images_df = pd.DataFrame(images)
# Map raw COCO category names onto the project's unified label set.
images_df['label'] = [base.RELATION_CATS[label.upper()] for label in images_df['label']]
# Stratified train/val/test split: 20% test, then 15% of the remainder as
# validation (overall roughly 68/12/20).
train, test = train_test_split(images_df, test_size=0.2,
                               stratify=images_df[['label']])
train, val = train_test_split(train, test_size=0.15,
                              stratify=train[['label']])
# Work on explicit copies: train_test_split returns views/slices of
# images_df, so assigning a new column directly would raise pandas'
# SettingWithCopyWarning.
train = train.copy()
val = val.copy()
test = test.copy()
train['type'] = 'train'
val['type'] = 'val'
test['type'] = 'test'
images_df = pd.concat([train, val, test])
# Quick sanity checks on the assembled dataset (notebook display cells).
images_df.head(n=10)
# Total number of annotation rows.
len(images_df.index)
images_df.info()
# Split sizes, absolute and as proportions.
images_df['type'].value_counts()
images_df['type'].value_counts(normalize=True)
# Class balance across the whole dataset.
images_df['label'].value_counts(normalize=True)
# Visual spot-check: draw 3 random training images with their annotations.
sample_imgs = images_df[(images_df.type == 'train')].sample(n=3)
utils.plot_data_sample(sample_imgs, images_df)
# Persist the annotated dataset as CSV (BOM-prefixed UTF-8 so Excel opens
# it correctly). newline='' is required when handing an already-open file
# object to DataFrame.to_csv — without it, rows are written as '\r\r\n'
# on Windows (see pandas to_csv documentation).
with open(base.TACO_CSV, 'w', encoding='utf-8-sig', newline='') as f:
    images_df.to_csv(f, index=False)
Se trata de un conjunto de imágenes en contexto plano (menos ideal) de botellas de vidrio, tetrapacks, botellas de plástico y latas de aluminio. La mayoría de las imágenes solo contiene un objeto, aunque hay algunas que tienen varios objetos. También cabe destacar que este conjunto de datos está aumentado, es decir, la misma imagen se repite en diferentes rotaciones y espejada.
Las anotaciones se encuentran en TXTs individuales en formato YOLO.
# Walk the dataset root and separate annotation files (.txt, YOLO format)
# from image files.
img_files = []
txt_files = []
for dirpath, _dirs, filenames in os.walk(base.DRINKING_WASTE):
    for filename in filenames:
        path = Path(dirpath) / filename
        if filename.endswith('.txt'):
            txt_files.append(path)
        else:
            img_files.append(path)

# Pair each annotation file with its image via the shared filename stem.
# Index images by stem ONCE instead of scanning img_files for every txt
# (the original was O(txts x imgs)); setdefault keeps the FIRST image per
# stem, matching the original `[...][0]` first-match behavior.
img_by_stem = {}
for img in img_files:
    img_by_stem.setdefault(img.stem, img)

img_txt = {}
for txt_f in txt_files:
    img_txt[img_by_stem[txt_f.stem]] = txt_f

# Report images that have no annotation file at all.
annotated = set(img_txt)
unannotated = [img for img in img_files if img not in annotated]
print(f'Imágenes sin fichero de anotaciones: {len(unannotated)}')
print(*unannotated, sep='\n')
# Build one DataFrame row per image from its YOLO annotation file.
images = {
    'name': [],
    'path': [],
    'width': [],
    'height': [],
    'label': [],
    'bbox-x': [],
    'bbox-y': [],
    'bbox-w': [],
    'bbox-h': [],
}
for img, txt in img_txt.items():
    # Read dimensions from the header without decoding the pixels.
    w, h = imagesize.get(str(img))
    df = pd.read_csv(str(txt), delim_whitespace=True, header=None,
                     names=['label', 'x', 'y', 'width', 'height'])
    # NOTE(review): only the FIRST annotation of each file is kept, even
    # though the dataset description says some images contain several
    # objects — confirm this is intentional.
    first = df.iloc[0]
    # Convert YOLO (normalized center x/y, w, h) to COCO pixel coords.
    bb_x, bb_y, bb_w, bb_h = utils.yolo2coco(first['x'], first['y'],
                                             first['width'], first['height'],
                                             w, h)
    # .append instead of `lst = lst + [x]`, which copies the whole list
    # on every iteration (quadratic).
    images['name'].append(img.name)
    images['path'].append(str(img))
    images['width'].append(w)
    images['height'].append(h)
    images['label'].append(base.RELATION_CATS[first['label'].astype(str)])
    images['bbox-x'].append(bb_x)
    images['bbox-y'].append(bb_y)
    images['bbox-w'].append(bb_w)
    images['bbox-h'].append(bb_h)
images_df = pd.DataFrame(images)
# Stratified train/val/test split: 20% test, then 15% of the remainder as
# validation (overall roughly 68/12/20) — same scheme as the TACO split.
train, test = train_test_split(images_df, test_size=0.2,
                               stratify=images_df[['label']])
train, val = train_test_split(train, test_size=0.15,
                              stratify=train[['label']])
# Work on explicit copies: train_test_split returns views/slices of
# images_df, so assigning a new column directly would raise pandas'
# SettingWithCopyWarning.
train = train.copy()
val = val.copy()
test = test.copy()
train['type'] = 'train'
val['type'] = 'val'
test['type'] = 'test'
images_df = pd.concat([train, val, test])
# Quick sanity checks on the assembled dataset (notebook display cells).
images_df.head(n=10)
# Total number of annotation rows.
len(images_df.index)
images_df.info()
# Split sizes, absolute and as proportions.
images_df['type'].value_counts()
images_df['type'].value_counts(normalize=True)
# Class balance across the whole dataset.
images_df['label'].value_counts(normalize=True)
# Visual spot-check: draw 5 random training images with their annotations.
sample_imgs = images_df[(images_df.type == 'train')].sample(n=5)
utils.plot_data_sample(sample_imgs, images_df)